library(tidyverse)
library(AmesHousing)
library(recipes)
library(caret)
library(rpart)
library(rpart.plot)
library(ranger)
library(xgboost)
library(AUC)
# Load the credit-scoring data and split it: each row is flagged as
# "treino" (~70%) or "teste" (~30%) via a reproducible uniform draw.
data("credit_data")
set.seed(42)
credit_data <- credit_data %>%
  mutate(base = if_else(runif(n()) < 0.7, "treino", "teste"))
# Preprocessing recipe, estimated on the training split only (no leakage):
# mean-impute numeric predictors, mode-impute nominal ones, dummy-encode
# nominal predictors, then drop highly correlated and near-zero-variance
# columns.
# FIX: step_meanimpute()/step_modeimpute() were deprecated in recipes
# >= 0.1.16 and renamed step_impute_mean()/step_impute_mode().
receita <- recipe(Status ~ ., data = credit_data %>% filter(base == "treino") %>% select(-base)) %>%
  step_impute_mean(all_numeric(), -all_outcomes()) %>%
  step_impute_mode(all_nominal(), -all_outcomes()) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_corr(all_predictors()) %>%
  step_nzv(all_predictors())
getModelInfo("rpart", FALSE)$rpart
$label
[1] "CART"
$library
[1] "rpart"
$type
[1] "Regression" "Classification"
$parameters
$grid
function(x, y, len = NULL, search = "grid"){
dat <- if(is.data.frame(x)) x else as.data.frame(x)
dat$.outcome <- y
initialFit <- rpart::rpart(.outcome ~ .,
data = dat,
control = rpart::rpart.control(cp = 0))$cptable
initialFit <- initialFit[order(-initialFit[,"CP"]), , drop = FALSE]
if(search == "grid") {
if(nrow(initialFit) < len) {
tuneSeq <- data.frame(cp = seq(min(initialFit[, "CP"]),
max(initialFit[, "CP"]),
length = len))
} else tuneSeq <- data.frame(cp = initialFit[1:len,"CP"])
colnames(tuneSeq) <- "cp"
} else {
tuneSeq <- data.frame(cp = unique(sample(initialFit[, "CP"], size = len, replace = TRUE)))
}
tuneSeq
}
$loop
function(grid) {
grid <- grid[order(grid$cp, decreasing = FALSE),, drop = FALSE]
loop <- grid[1,,drop = FALSE]
submodels <- list(grid[-1,,drop = FALSE])
list(loop = loop, submodels = submodels)
}
$fit
function(x, y, wts, param, lev, last, classProbs, ...) {
cpValue <- if(!last) param$cp else 0
theDots <- list(...)
if(any(names(theDots) == "control"))
{
theDots$control$cp <- cpValue
theDots$control$xval <- 0
ctl <- theDots$control
theDots$control <- NULL
} else ctl <- rpart::rpart.control(cp = cpValue, xval = 0)
## check to see if weights were passed in (and availible)
if(!is.null(wts)) theDots$weights <- wts
modelArgs <- c(list(formula = as.formula(".outcome ~ ."),
data = if(is.data.frame(x)) x else as.data.frame(x),
control = ctl),
theDots)
modelArgs$data$.outcome <- y
out <- do.call(rpart::rpart, modelArgs)
if(last) out <- rpart::prune.rpart(out, cp = param$cp)
out
}
$predict
function(modelFit, newdata, submodels = NULL) {
if(!is.data.frame(newdata)) newdata <- as.data.frame(newdata)
pType <- if(modelFit$problemType == "Classification") "class" else "vector"
out <- predict(modelFit, newdata, type=pType)
if(!is.null(submodels))
{
tmp <- vector(mode = "list", length = nrow(submodels) + 1)
tmp[[1]] <- out
for(j in seq(along = submodels$cp))
{
prunedFit <- rpart::prune.rpart(modelFit, cp = submodels$cp[j])
tmp[[j+1]] <- predict(prunedFit, newdata, type=pType)
}
out <- tmp
}
out
}
$prob
function(modelFit, newdata, submodels = NULL) {
if(!is.data.frame(newdata)) newdata <- as.data.frame(newdata)
out <- predict(modelFit, newdata, type = "prob")
if(!is.null(submodels))
{
tmp <- vector(mode = "list", length = nrow(submodels) + 1)
tmp[[1]] <- out
for(j in seq(along = submodels$cp))
{
prunedFit <- rpart::prune.rpart(modelFit, cp = submodels$cp[j])
tmpProb <- predict(prunedFit, newdata, type = "prob")
tmp[[j+1]] <- as.data.frame(tmpProb[, modelFit$obsLevels, drop = FALSE])
}
out <- tmp
}
out
}
$predictors
function(x, surrogate = TRUE, ...) {
out <- as.character(x$frame$var)
out <- out[!(out %in% c("<leaf>"))]
if(surrogate)
{
splits <- x$splits
splits <- splits[splits[,"adj"] > 0,]
out <- c(out, rownames(splits))
}
unique(out)
}
$varImp
function(object, surrogates = FALSE, competes = TRUE, ...) {
if(nrow(object$splits)>0) {
tmp <- rownames(object$splits)
rownames(object$splits) <- 1:nrow(object$splits)
splits <- data.frame(object$splits)
splits$var <- tmp
splits$type <- ""
frame <- as.data.frame(object$frame)
index <- 0
for(i in 1:nrow(frame)) {
if(frame$var[i] != "<leaf>") {
index <- index + 1
splits$type[index] <- "primary"
if(frame$ncompete[i] > 0) {
for(j in 1:frame$ncompete[i]) {
index <- index + 1
splits$type[index] <- "competing"
}
}
if(frame$nsurrogate[i] > 0) {
for(j in 1:frame$nsurrogate[i]) {
index <- index + 1
splits$type[index] <- "surrogate"
}
}
}
}
splits$var <- factor(as.character(splits$var))
if(!surrogates) splits <- subset(splits, type != "surrogate")
if(!competes) splits <- subset(splits, type != "competing")
out <- aggregate(splits$improve,
list(Variable = splits$var),
sum,
na.rm = TRUE)
} else {
out <- data.frame(x = numeric(), Vaiable = character())
}
allVars <- colnames(attributes(object$terms)$factors)
if(!all(allVars %in% out$Variable)) {
missingVars <- allVars[!(allVars %in% out$Variable)]
zeros <- data.frame(x = rep(0, length(missingVars)),
Variable = missingVars)
out <- rbind(out, zeros)
}
out2 <- data.frame(Overall = out$x)
rownames(out2) <- out$Variable
out2
}
$levels
function(x) x$obsLevels
$trim
function(x) {
x$call <- list(na.action = (x$call)$na.action)
x$x <- NULL
x$y <- NULL
x$where <- NULL
x
}
$tags
[1] "Tree-Based Model" "Implicit Feature Selection"
[3] "Handle Missing Predictor Data" "Accepts Case Weights"
$sort
function(x) x[order(x[,1], decreasing = TRUE),]
# 5-fold cross-validation control. classProbs = TRUE is required so that
# twoClassSummary() can compute ROC / Sens / Spec.
# FIX: verboseIter is documented as a logical; use TRUE instead of the
# numeric 1 (which only works by coercion).
train_control_rpart <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  verboseIter = TRUE
)
# TIP: run
# info <- getModelInfo("rpart", FALSE)$rpart
# info$parameters
# to see which tuning parameters "rpart" exposes (only cp).
# Negative cp values force rpart to keep even non-improving splits.
grid_rpart <- data.frame(cp = seq(from = -0.001, to = 0.01, by = 0.0001))
# Fit a CART model: caret prepares the recipe inside each CV fold and tunes
# cp over grid_rpart, picking the value with the largest cross-validated ROC.
modelo_rpart <- train(
  x = receita,
  data = credit_data %>% filter(base == "treino") %>% select(-base),
  method = "rpart",
  metric = "ROC",
  trControl = train_control_rpart,
  tuneGrid = grid_rpart
)
Preparing recipe
+ Fold1: cp=-0.001
- Fold1: cp=-0.001
+ Fold2: cp=-0.001
- Fold2: cp=-0.001
+ Fold3: cp=-0.001
modelo_rpart
CART
3070 samples
13 predictor
2 classes: 'bad', 'good'
Recipe steps: meanimpute, modeimpute, dummy, corr, nzv
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 2456, 2455, 2457, 2457, 2455
Resampling results across tuning parameters:
cp ROC Sens Spec
-0.0010 0.7385880 0.4976746 0.8292404
-0.0009 0.7385880 0.4976746 0.8292404
-0.0008 0.7385880 0.4976746 0.8292404
-0.0007 0.7385880 0.4976746 0.8292404
-0.0006 0.7385880 0.4976746 0.8292404
-0.0005 0.7385880 0.4976746 0.8292404
-0.0004 0.7385880 0.4976746 0.8292404
-0.0003 0.7385880 0.4976746 0.8292404
-0.0002 0.7385880 0.4976746 0.8292404
-0.0001 0.7385880 0.4976746 0.8292404
0.0000 0.7530080 0.4838416 0.8419563
0.0001 0.7530080 0.4838416 0.8419563
0.0002 0.7530080 0.4838416 0.8419563
0.0003 0.7508616 0.4815428 0.8428633
0.0004 0.7497139 0.4792373 0.8460431
0.0005 0.7459665 0.4838350 0.8460431
0.0006 0.7468441 0.4815228 0.8474067
0.0007 0.7468441 0.4815228 0.8474067
0.0008 0.7527045 0.4734768 0.8619470
0.0009 0.7527045 0.4734768 0.8619470
0.0010 0.7512653 0.4734768 0.8637652
0.0011 0.7519638 0.4757757 0.8646722
0.0012 0.7519638 0.4757757 0.8646722
0.0013 0.7519638 0.4757757 0.8646722
0.0014 0.7519638 0.4757757 0.8646722
0.0015 0.7546083 0.4758355 0.8728458
0.0016 0.7546083 0.4758355 0.8728458
0.0017 0.7546083 0.4758355 0.8728458
0.0018 0.7546083 0.4758355 0.8728458
0.0019 0.7546083 0.4758355 0.8728458
0.0020 0.7513883 0.4723939 0.8728458
0.0021 0.7513883 0.4723939 0.8728458
0.0022 0.7520362 0.4781676 0.8737580
0.0023 0.7520362 0.4781676 0.8737580
0.0024 0.7515843 0.4793236 0.8742125
0.0025 0.7502133 0.4804731 0.8769336
0.0026 0.7503254 0.4804731 0.8778407
0.0027 0.7477346 0.4701282 0.8792012
0.0028 0.7477346 0.4701282 0.8792012
0.0029 0.7503338 0.4563218 0.8837374
0.0030 0.7503338 0.4563218 0.8837374
0.0031 0.7503338 0.4563218 0.8837374
0.0032 0.7503338 0.4563218 0.8837374
0.0033 0.7503338 0.4563218 0.8837374
0.0034 0.7507174 0.4563218 0.8837374
0.0035 0.7503456 0.4540097 0.8855556
0.0036 0.7511207 0.4540097 0.8846465
0.0037 0.7465225 0.4413660 0.8923593
0.0038 0.7465225 0.4413660 0.8923593
0.0039 0.7408048 0.4367550 0.8932684
0.0040 0.7335255 0.4309747 0.8964502
0.0041 0.7335255 0.4309747 0.8964502
0.0042 0.7335255 0.4309747 0.8964502
0.0043 0.7335255 0.4309747 0.8964502
0.0044 0.7336564 0.4240516 0.8946403
0.0045 0.7336564 0.4240516 0.8946403
0.0046 0.7320066 0.4206033 0.8973614
0.0047 0.7320066 0.4206033 0.8973614
0.0048 0.7313576 0.4206033 0.8982705
0.0049 0.7290796 0.4160056 0.8996341
0.0050 0.7290796 0.4160056 0.8996341
0.0051 0.7286288 0.4343964 0.8896547
0.0052 0.7286288 0.4343964 0.8896547
0.0053 0.7297652 0.4286493 0.8928293
0.0054 0.7297652 0.4286493 0.8928293
0.0055 0.7297652 0.4286493 0.8928293
0.0056 0.7297652 0.4286493 0.8928293
0.0057 0.7297652 0.4286493 0.8928293
0.0058 0.7298794 0.4147831 0.8964616
0.0059 0.7298794 0.4147831 0.8964616
0.0060 0.7298794 0.4147831 0.8964616
0.0061 0.7298794 0.4147831 0.8964616
0.0062 0.7298794 0.4147831 0.8964616
0.0063 0.7298794 0.4147831 0.8964616
0.0064 0.7298794 0.4147831 0.8964616
0.0065 0.7302964 0.4216796 0.8928334
0.0066 0.7302964 0.4216796 0.8928334
0.0067 0.7302964 0.4216796 0.8928334
0.0068 0.7302964 0.4216796 0.8928334
0.0069 0.7172084 0.4090359 0.9000897
0.0070 0.7172084 0.4090359 0.9000897
0.0071 0.7172084 0.4090359 0.9000897
0.0072 0.7164385 0.4067238 0.8982715
0.0073 0.7177774 0.4228158 0.8955442
0.0074 0.7177774 0.4228158 0.8955442
0.0075 0.7177774 0.4228158 0.8955442
0.0076 0.7177774 0.4228158 0.8955442
0.0077 0.7177774 0.4228158 0.8955442
0.0078 0.7177774 0.4228158 0.8955442
0.0079 0.7177774 0.4228158 0.8955442
0.0080 0.7177774 0.4228158 0.8955442
0.0081 0.7177774 0.4228158 0.8955442
0.0082 0.7177774 0.4228158 0.8955442
0.0083 0.7177774 0.4228158 0.8955442
0.0084 0.7177774 0.4228158 0.8955442
0.0085 0.7177774 0.4228158 0.8955442
0.0086 0.7177774 0.4228158 0.8955442
0.0087 0.6905127 0.3848914 0.9023645
0.0088 0.6905127 0.3848914 0.9023645
0.0089 0.6905127 0.3848914 0.9023645
0.0090 0.6905127 0.3848914 0.9023645
0.0091 0.6905127 0.3848914 0.9023645
0.0092 0.6905127 0.3848914 0.9023645
0.0093 0.6905127 0.3848914 0.9023645
0.0094 0.6881358 0.3641153 0.9078149
0.0095 0.6881358 0.3641153 0.9078149
0.0096 0.6881358 0.3641153 0.9078149
0.0097 0.6881358 0.3641153 0.9078149
0.0098 0.6881358 0.3641153 0.9078149
0.0099 0.6881358 0.3641153 0.9078149
0.0100 0.6881358 0.3641153 0.9078149
ROC was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.0019.
# Best tuning parameter (cp) chosen by cross-validated ROC.
modelo_rpart$bestTune
# Variable importance of the final pruned tree (console output below).
varImp(modelo_rpart)
rpart variable importance
# Profile plot: ROC across the cp tuning grid.
plot(modelo_rpart)
# only for tree models: draw the fitted tree
rpart.plot(modelo_rpart$finalModel)
# Save a large (20 x 10 inch) version of the tree diagram to PDF.
pdf("arvore.pdf", 20, 10)
rpart.plot(modelo_rpart$finalModel)
dev.off()
png
2
# Confusion matrix for the CART model on the hold-out ("teste") rows.
# First store the predicted probability of the "bad" class for every row.
credit_data <- credit_data %>%
  mutate(pred_rpart = predict(modelo_rpart, ., type = "prob")$bad)
credit_data_teste <- credit_data %>% filter(base %in% "teste")
caret::confusionMatrix(
  data = predict(modelo_rpart, credit_data_teste),
  reference = credit_data_teste$Status,
  mode = "everything"
)
Confusion Matrix and Statistics
Reference
Prediction bad good
bad 185 119
good 201 879
Accuracy : 0.7688
95% CI : (0.7457, 0.7908)
No Information Rate : 0.7211
P-Value [Acc > NIR] : 3.213e-05
Kappa : 0.3851
Mcnemar's Test P-Value : 5.953e-06
Sensitivity : 0.4793
Specificity : 0.8808
Pos Pred Value : 0.6086
Neg Pred Value : 0.8139
Precision : 0.6086
Recall : 0.4793
F1 : 0.5362
Prevalence : 0.2789
Detection Rate : 0.1337
Detection Prevalence : 0.2197
Balanced Accuracy : 0.6800
'Positive' Class : bad
# Plot each stored ROC curve. roc_rpart is assumed to hold one AUC::roc
# object per base ("treino"/"teste") — built in a part of the file not
# shown here; TODO confirm against the full script.
roc_rpart$roc %>% walk(plot)
# extra plot ---- careful: advanced R code!
roc_plot <- roc_rpart %>%
select(base, roc, auc) %>%
mutate(
# each roc object is a classed list; unclass + as.data.frame yields a
# tidy frame with cutoffs / fpr / tpr columns
roc = map(roc, ~{
.x %>%
unclass %>%
as.data.frame
})
) %>%
# NOTE(review): bare unnest() is deprecated in tidyr >= 1.0; unnest(roc)
# would be the modern call — kept verbatim to preserve the transcript
unnest %>%
ggplot(aes(x = fpr, y = tpr, colour = base, label = cutoffs)) +
geom_line() +
geom_abline(colour = "grey50") +
theme_minimal() +
coord_fixed()
# Interactive version: hovering shows the cutoff for each curve point.
plotly::ggplotly(roc_plot)
# Inspect caret's model spec for ranger (random forest).
infos <- getModelInfo("ranger", FALSE)$ranger
# FIX: save() takes the target path via the `file =` argument. Passing the
# path positionally makes save() look for an object literally named
# "modelo_rf.RData", which is exactly the "objeto ... não encontrado" error
# shown in the original transcript.
save(modelo_rf, file = "modelo_rf.RData")
# Variable importance of the ranger model (console output below).
varImp(modelo_rf)
ranger variable importance
# Predictions: probability of the "bad" class for every row (train + test).
credit_data <- credit_data %>%
mutate(
pred_rf = predict(modelo_rf, ., type = "prob")$bad
)
# Confusion matrix on the hold-out ("teste") rows.
credit_data_teste <- credit_data %>% filter(base %in% "teste")
caret::confusionMatrix(predict(modelo_rf, credit_data_teste), credit_data_teste$Status, mode = "everything")
Confusion Matrix and Statistics
Reference
Prediction bad good
bad 180 79
good 206 919
Accuracy : 0.7941
95% CI : (0.7718, 0.8151)
No Information Rate : 0.7211
P-Value [Acc > NIR] : 2.703e-10
Kappa : 0.4306
Mcnemar's Test P-Value : 8.419e-14
Sensitivity : 0.4663
Specificity : 0.9208
Pos Pred Value : 0.6950
Neg Pred Value : 0.8169
Precision : 0.6950
Recall : 0.4663
F1 : 0.5581
Prevalence : 0.2789
Detection Rate : 0.1301
Detection Prevalence : 0.1871
Balanced Accuracy : 0.6936
'Positive' Class : bad
# Model comparison: one point per (model, base) combination with its AUC.
# rocs is assumed to be a tibble with columns base / modelo / roc / auc,
# built in a part of the file not shown here — TODO confirm.
rocs %>%
ggplot(aes(x = auc, y = modelo, colour = base)) +
geom_point(size = 5) +
theme_minimal(30)
# extra plot ---- careful: advanced R code!
roc_plot <- rocs %>%
select(base, modelo, roc) %>%
mutate(
# flatten each roc object into a data.frame of cutoffs / fpr / tpr
roc = map(roc, ~{
.x %>%
unclass %>%
as.data.frame
})
) %>%
# NOTE(review): bare unnest() is deprecated in tidyr >= 1.0; unnest(roc)
# would be the modern call — kept verbatim to preserve the transcript
unnest %>%
ggplot(aes(x = fpr, y = tpr, colour = modelo, label = cutoffs)) +
geom_line() +
geom_abline(colour = "grey50") +
theme_minimal() +
coord_fixed() +
facet_wrap(~base)
# Interactive version, faceted by train/test split.
plotly::ggplotly(roc_plot)
Exercício: Ajuste um xgboost usando o caret e responda: qual modelo apresenta a maior AUC? Ctrl+C, Ctrl+V por sua conta!
DICA 1) troque “ranger” por “xgbTree”. DICA 2) rode info <- getModelInfo("xgbTree", FALSE)$xgbTree e depois consulte info$parameters. DICA 3) experimente usar o parâmetro tuneLength = 20 em vez do `tuneGrid`.
getModelInfo("xgbTree", FALSE)$xgbTree
$label
[1] "eXtreme Gradient Boosting"
$library
[1] "xgboost" "plyr"
$type
[1] "Regression" "Classification"
$parameters
$grid
function(x, y, len = NULL, search = "grid") {
if(search == "grid") {
out <- expand.grid(max_depth = seq(1, len),
nrounds = floor((1:len) * 50),
eta = c(.3, .4),
gamma = 0,
colsample_bytree = c(.6, .8),
min_child_weight = c(1),
subsample = seq(.5, 1, length = len))
} else {
out <- data.frame(nrounds = sample(1:1000, size = len, replace = TRUE),
max_depth = sample(1:10, replace = TRUE, size = len),
eta = runif(len, min = .001, max = .6),
gamma = runif(len, min = 0, max = 10),
colsample_bytree = runif(len, min = .3, max = .7),
min_child_weight = sample(0:20, size = len, replace = TRUE),
subsample = runif(len, min = .25, max = 1))
out$nrounds <- floor(out$nrounds)
out <- out[!duplicated(out),]
}
out
}
$loop
function(grid) {
loop <- plyr::ddply(grid, c("eta", "max_depth", "gamma",
"colsample_bytree", "min_child_weight",
"subsample"),
function(x) c(nrounds = max(x$nrounds)))
submodels <- vector(mode = "list", length = nrow(loop))
for(i in seq(along = loop$nrounds)) {
index <- which(grid$max_depth == loop$max_depth[i] &
grid$eta == loop$eta[i] &
grid$gamma == loop$gamma[i] &
grid$colsample_bytree == loop$colsample_bytree[i] &
grid$min_child_weight == loop$min_child_weight[i] &
grid$subsample == loop$subsample[i])
trees <- grid[index, "nrounds"]
submodels[[i]] <- data.frame(nrounds = trees[trees != loop$nrounds[i]])
}
list(loop = loop, submodels = submodels)
}
$fit
function(x, y, wts, param, lev, last, classProbs, ...) {
if(!inherits(x, "xgb.DMatrix"))
x <- as.matrix(x)
if(is.factor(y)) {
if(length(lev) == 2) {
y <- ifelse(y == lev[1], 1, 0)
if(!inherits(x, "xgb.DMatrix"))
x <- xgboost::xgb.DMatrix(x, label = y, missing = NA) else
xgboost::setinfo(x, "label", y)
if (!is.null(wts))
xgboost::setinfo(x, 'weight', wts)
out <- xgboost::xgb.train(list(eta = param$eta,
max_depth = param$max_depth,
gamma = param$gamma,
colsample_bytree = param$colsample_bytree,
min_child_weight = param$min_child_weight,
subsample = param$subsample),
data = x,
nrounds = param$nrounds,
objective = "binary:logistic",
...)
} else {
y <- as.numeric(y) - 1
if(!inherits(x, "xgb.DMatrix"))
x <- xgboost::xgb.DMatrix(x, label = y, missing = NA) else
xgboost::setinfo(x, "label", y)
if (!is.null(wts))
xgboost::setinfo(x, 'weight', wts)
out <- xgboost::xgb.train(list(eta = param$eta,
max_depth = param$max_depth,
gamma = param$gamma,
colsample_bytree = param$colsample_bytree,
min_child_weight = param$min_child_weight,
subsample = param$subsample),
data = x,
num_class = length(lev),
nrounds = param$nrounds,
objective = "multi:softprob",
...)
}
} else {
if(!inherits(x, "xgb.DMatrix"))
x <- xgboost::xgb.DMatrix(x, label = y, missing = NA) else
xgboost::setinfo(x, "label", y)
if (!is.null(wts))
xgboost::setinfo(x, 'weight', wts)
out <- xgboost::xgb.train(list(eta = param$eta,
max_depth = param$max_depth,
gamma = param$gamma,
colsample_bytree = param$colsample_bytree,
min_child_weight = param$min_child_weight,
subsample = param$subsample),
data = x,
nrounds = param$nrounds,
objective = "reg:linear",
...)
}
out
}
$predict
function(modelFit, newdata, submodels = NULL) {
if(!inherits(newdata, "xgb.DMatrix")) {
newdata <- as.matrix(newdata)
newdata <- xgboost::xgb.DMatrix(data=newdata, missing = NA)
}
out <- predict(modelFit, newdata)
if(modelFit$problemType == "Classification") {
if(length(modelFit$obsLevels) == 2) {
out <- ifelse(out >= .5,
modelFit$obsLevels[1],
modelFit$obsLevels[2])
} else {
out <- matrix(out, ncol = length(modelFit$obsLevels), byrow = TRUE)
out <- modelFit$obsLevels[apply(out, 1, which.max)]
}
}
if(!is.null(submodels)) {
tmp <- vector(mode = "list", length = nrow(submodels) + 1)
tmp[[1]] <- out
for(j in seq(along = submodels$nrounds)) {
tmp_pred <- predict(modelFit, newdata, ntreelimit = submodels$nrounds[j])
if(modelFit$problemType == "Classification") {
if(length(modelFit$obsLevels) == 2) {
tmp_pred <- ifelse(tmp_pred >= .5,
modelFit$obsLevels[1],
modelFit$obsLevels[2])
} else {
tmp_pred <- matrix(tmp_pred, ncol = length(modelFit$obsLevels), byrow = TRUE)
tmp_pred <- modelFit$obsLevels[apply(tmp_pred, 1, which.max)]
}
}
tmp[[j+1]] <- tmp_pred
}
out <- tmp
}
out
}
$prob
function(modelFit, newdata, submodels = NULL) {
if(!inherits(newdata, "xgb.DMatrix")) {
newdata <- as.matrix(newdata)
newdata <- xgboost::xgb.DMatrix(data=newdata, missing = NA)
}
if( !is.null(modelFit$param$objective) && modelFit$param$objective == 'binary:logitraw'){
p <- predict(modelFit, newdata)
out <-binomial()$linkinv(p) # exp(p)/(1+exp(p))
} else {
out <- predict(modelFit, newdata)
}
if(length(modelFit$obsLevels) == 2) {
out <- cbind(out, 1 - out)
colnames(out) <- modelFit$obsLevels
} else {
out <- matrix(out, ncol = length(modelFit$obsLevels), byrow = TRUE)
colnames(out) <- modelFit$obsLevels
}
out <- as.data.frame(out)
if(!is.null(submodels)) {
tmp <- vector(mode = "list", length = nrow(submodels) + 1)
tmp[[1]] <- out
for(j in seq(along = submodels$nrounds)) {
tmp_pred <- predict(modelFit, newdata, ntreelimit = submodels$nrounds[j])
if(length(modelFit$obsLevels) == 2) {
tmp_pred <- cbind(tmp_pred, 1 - tmp_pred)
colnames(tmp_pred) <- modelFit$obsLevels
} else {
tmp_pred <- matrix(tmp_pred, ncol = length(modelFit$obsLevels), byrow = TRUE)
colnames(tmp_pred) <- modelFit$obsLevels
}
tmp_pred <- as.data.frame(tmp_pred)
tmp[[j+1]] <- tmp_pred
}
out <- tmp
}
out
}
$predictors
function(x, ...) {
imp <- xgboost::xgb.importance(x$xNames, model = x)
x$xNames[x$xNames %in% imp$Feature]
}
$varImp
function(object, numTrees = NULL, ...) {
imp <- xgboost::xgb.importance(object$xNames, model = object)
imp <- as.data.frame(imp)[, 1:2]
rownames(imp) <- as.character(imp[,1])
imp <- imp[,2,drop = FALSE]
colnames(imp) <- "Overall"
missing <- object$xNames[!(object$xNames %in% rownames(imp))]
missing_imp <- data.frame(Overall=rep(0, times=length(missing)))
rownames(missing_imp) <- missing
imp <- rbind(imp, missing_imp)
imp
}
$levels
function(x) x$obsLevels
$tags
[1] "Tree-Based Model" "Boosting"
[3] "Ensemble Model" "Implicit Feature Selection"
[5] "Accepts Case Weights"
$sort
function(x) {
# This is a toss-up, but the # trees probably adds
# complexity faster than number of splits
x[order(x$nrounds, x$max_depth, x$eta, x$gamma, x$colsample_bytree, x$min_child_weight),]
}
# CV control for xgboost: 5-fold cross-validation with ROC-based summary,
# drawing hyperparameter candidates at random (search = "random").
# FIX: verboseIter is documented as a logical; use TRUE instead of the
# numeric 1 (which only works by coercion).
train_control_xgb <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  verboseIter = TRUE,
  search = "random"
)
# Fit xgboost via caret. train_control_xgb uses search = "random", so we
# request 20 random hyperparameter draws with tuneLength (exactly what
# DICA 3 of the exercise suggests).
# FIX: the original passed tuneGrid = tune_grid_xgb, an object never defined
# anywhere in the script — evaluating it would error with "object not found".
modelo_xgb <- train(
  receita,
  credit_data %>% filter(base %in% "treino") %>% select(-base),
  method = "xgbTree",
  metric = "ROC",
  trControl = train_control_xgb,
  tuneLength = 20
)
Preparing recipe
+ Fold1: eta=0.010, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
- Fold1: eta=0.010, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
+ Fold1: eta=0.100, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
- Fold1: eta=0.100, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
+ Fold1: eta=0.146, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
- Fold1: eta=0.146, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
+ Fold1: eta=0.500, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
- Fold1: eta=0.500, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
+ Fold1: eta=1.000, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
- Fold1: eta=1.000, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
+ Fold2: eta=0.010, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
- Fold2: eta=0.010, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
+ Fold2: eta=0.100, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
- Fold2: eta=0.100, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
+ Fold2: eta=0.146, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
- Fold2: eta=0.146, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
+ Fold2: eta=0.500, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
- Fold2: eta=0.500, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
+ Fold2: eta=1.000, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
- Fold2: eta=1.000, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200
+ Fold3: eta=0.010, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200